## Download 2006 Canadian Census DAs with population and boundaries and do some basic recoding

library(here)
library(sf)
library(cancensus)
library(dplyr)

## Set the CensusMapper API key here (for the cancensus package)
apikey <- ""

tryCatch(
  {
    set_cancensus_cache_path(".", install = TRUE)
    message("Cache path set successfully.")
  },
  error = function(e) {
    message("the cache path is already set")
  }
)


tryCatch(
  {
    set_cancensus_api_key(apikey, install = TRUE)
    message("The API key has been set")
  },
  error = function(e) {
    message("The API key is already set")
  }
)


## list_census_datasets() %>% print(n = 100)
ca06vars <- list_census_vectors("CA06")

## # A tibble: 8 × 7
##   vector      type  label                                       units              parent_vector aggregation           details
##   <chr>       <fct> <chr>                                       <fct>              <chr>         <chr>                 <chr>
## 1 v_CA06_253  Total Official language minority - (number)       Number             NA            Additive              Census 2006;…
## 2 v_CA06_254  Total Official language minority - (percentage)   Percentage (0-100) NA            Average of v_CA06_248 Census 2006;…
## 3 v_CA06_1302 Total Total population by visible minority groups Number             NA            Additive              Census 2006;…
## 4 v_CA06_1303 Total Total visible minority population           Number             v_CA06_1302   Additive              Census 2006;…
## 5 v_CA06_1314 Total Visible minority, n.i.e.                    Number             v_CA06_1303   Additive              Census 2006;…
## 6 v_CA06_1315 Total Multiple visible minority                   Number             v_CA06_1303   Additive              Census 2006;…
## 7 v_CA06_1316 Total Not a visible minority                      Number             v_CA06_1302   Additive              Census 2006;…
## 8 v_CA06_107  Total Minor repairs                               Number             v_CA06_105    Additive              Census 2006;…


## ca06regions <- list_census_regions("CA06")

ca06vars %>% filter(label %in% grep("minor", ca06vars$label, ignore.case = TRUE, value = TRUE))
ca06vars %>% filter(label %in% grep("chin", ca06vars$label, ignore.case = TRUE, value = TRUE))
ca06vars %>% filter(label %in% grep("black", ca06vars$label, ignore.case = TRUE, value = TRUE))
ca06vars %>% filter(label %in% grep("latin", ca06vars$label, ignore.case = TRUE, value = TRUE))
ca06vars %>% filter(label %in% grep("asian", ca06vars$label, ignore.case = TRUE, value = TRUE))

ca06vars %>%
  filter(label %in% grep("asian", ca06vars$label, ignore.case = TRUE, value = TRUE)) %>%
  select(vector) %>%
  unlist()

ca06vars %>%
  filter(grepl("minor", ca06vars$details, ignore.case = TRUE)) %>%
  select(vector, label, parent_vector, details)

ca06vars %>%
  filter(grepl("black", ca06vars$details, ignore.case = TRUE)) %>%
  select(vector, label, parent_vector, details)

ca06vars %>%
  filter(vector == "v_CA06_1306") %>%
  unlist()

ca06vars %>%
  filter(grepl("latin", ca06vars$details, ignore.case = TRUE)) %>%
  select(vector, label, parent_vector, details) %>%
  print(n = 100)

ca06vars %>%
  filter(vector == "v_CA06_1306") %>%
  unlist()

ca06vars %>%
  filter(grepl("chin", ca06vars$details, ignore.case = TRUE)) %>%
  select(vector, label, parent_vector, details) %>%
  print(n = 100)

ca06vars %>%
  filter(grepl("south", details, ignore.case = TRUE) & grepl("visibl", details, ignore.case = TRUE)) %>%
  select(vector, label, parent_vector, details) %>%
  print(n = 100)

ca06vars %>%
  filter(grepl("fili|korea|japan|west|south", details, ignore.case = TRUE) & grepl("visibl", details, ignore.case = TRUE)) %>%
  select(vector, label, parent_vector, details) %>%
  print(n = 100)

vm_vars0 <- ca06vars %>%
  filter(parent_vector == "v_CA06_1303") %>%
  select(vector, label, parent_vector, details)

vm_vars1 <- c(
  unlist(vm_vars0$vector),
  "v_CA06_1302", "v_CA06_1303", "v_CA06_1314", "v_CA06_1315", "v_CA06_1316",
  "v_CA06_1132", "v_CA06_1304", "v_CA06_1537", "v_CA06_1534",
  "v_CA06_1306", "v_CA06_1449", "v_CA06_1442",
  "v_CA06_1308", "v_CA06_1362", "v_CA06_1384", "v_CA06_1317", "v_CA06_1362",
  "v_CA06_1305", "v_CA06_1309", "v_CA06_1311", "v_CA06_1507", "v_CA06_1519",
  "v_CA06_1520", "v_CA06_1533", "v_CA06_1534", "v_CA06_1552", "v_CA06_1553"
)


# n.i.e means not included elsewhere

vm_vars <- ca06vars %>%
  filter(vector %in% unique(vm_vars1)) %>%
  select(vector, type, label, parent_vector, details)

vm_vars_vec <- unlist(vm_vars$vector)

# Return an sf-class data frame
census_data <- get_census(
  dataset = "CA06",
  regions = list(C = "Canada"),
  level = "DA",
  vectors = c("v_CA06_1", vm_vars_vec),
  geo_format = "sf",
  resolution = "high"
)

## Rename some variables and calculate proportion visible minority
# new_names <- sapply(strsplit(names(census_data), split = ":", fixed = TRUE), function(x) x[[1]])
# names(census_data)[1:(ncol(census_data)-1)] <- new_names[1:(ncol(census_data)-1)]

census_data <- census_data %>% rename(
  "da_area_kmsq_06" = "Area (sq km)",
  "da_pop_06" = "v_CA06_1: Population, 2006 - 100% data",
  "da_pop_20pct_06" = "v_CA06_1302: Total population by visible minority groups - 20% sample data",
  "da_vm_pop_06" = "v_CA06_1303: Total visible minority population",
  "da_vm_pop_nie_06" = "v_CA06_1314: Visible minority, n.i.e.", ## n.i.e. means not included elsewhere
  "da_vm_pop_mult_06" = "v_CA06_1315: Multiple visible minority",
  "da_non_vm_pop_06" = "v_CA06_1316: Not a visible minority",
  "da_black" = "v_CA06_1306: Black", # parent vector is v_CA06_1303, Total Vis min
  "da_black_1449" = "v_CA06_1449: Black", # parent vector is v_CA06_1442, African Origins
  "da_african_origins" = "v_CA06_1442: African origins",
  "da_latin" = "v_CA06_1308: Latin American", # parent vector is v_CA06_1303
  "da_chinese" = "v_CA06_1304: Chinese",
  "da_south_asian" = "v_CA06_1305: South Asian",
  "da_filipino" = "v_CA06_1307: Filipino",
  "da_southeast_asian" = "v_CA06_1309: Southeast Asian",
  "da_west_asian" = "v_CA06_1311: West Asian",
  "da_korean" = "v_CA06_1312: Korean",
  "da_japanese" = "v_CA06_1313: Japanese"
)

## We will use the race/ethnicity variables that come from the visible
## minorities series whenever possible.
##
## census_data %>%
##   filter(da_black_1306 > 0 | da_black_1449 > 0) %>%
##   select(da_black_1306, da_black_1449, da_african_origins, da_vm_pop_06, da_pop_20pct_06) %>%
##   head(n = 20)


## from the old codebooks

## COL10 - Total visible minority population / Filipino
## COL12 - Total visible minority population / Southeast Asian
## COL14 - Total visible minority population / West Asian
## COL15 - Total visible minority population / Korean
## COL16 - Total visible minority population / Japanese
## from survey.geo.merge.R
##
## survey.geo$DAOtherAsian<-c(survey.geo$DACOL10+survey.geo$DACOL12+
##                              + survey.geo$DACOL14+survey.geo$DACOL15+survey.geo$DACOL16)/survey.geo$DACOL04
## survey.geo$DAChinese<-survey.geo$DACOL07/survey.geo$DACOL04
## survey.geo$DABlack<-survey.geo$DACOL09/survey.geo$DACOL04
## survey.geo$DALatin<-survey.geo$DACOL11/survey.geo$DACOL04
## survey.geo$DASouthAsian<-survey.geo$DACOL08/survey.geo$DACOL04
##
## survey.geo$obj.diversity.index.da <- survey.geo$DAOtherAsian^2 +
##   survey.geo$DAChinese^2 +
##   survey.geo$DABlack^2 +
##   survey.geo$DALatin^2 +
##   survey.geo$DASouthAsian^2
##
## summary(survey.geo$obj.diversity.index)

summary(census_data)

census_data <- census_data %>% mutate(
  da_prop_vm_100pct_06 = da_vm_pop_06 / da_pop_06,
  da_prop_vm_20pct_06 = da_vm_pop_06 / da_pop_20pct_06,
  da_pop_dens_06 = da_pop_06 / da_area_kmsq_06,
  da_other_asian = (da_filipino + da_japanese + da_korean + da_west_asian + da_southeast_asian),
  da_prop_black = da_black / da_pop_20pct_06,
  da_prop_latin = da_latin / da_pop_20pct_06,
  da_prop_south_asian = da_south_asian / da_pop_20pct_06,
  da_prop_chinese = da_chinese / da_pop_20pct_06,
  da_prop_other_asian = da_other_asian / da_pop_20pct_06
)

## Checking that the mutate step did the right thing
census_data %>%
  filter(da_other_asian > 0 & da_filipino > 0 & da_korean > 0) %>%
  select(da_filipino, da_korean, da_japanese, da_southeast_asian, da_west_asian, da_other_asian, da_pop_20pct_06, da_vm_pop_06)

summary(census_data$da_prop_vm_20pct_06)
summary(census_data$da_prop_vm_100pct_06)
## Note that some are missing. I don't know what this means when it comes to our data. We will discover later
## Also a few places with more than 100% almost surely because of hiding true numbers (notice that they all end in 5 or 0?)
table(census_data$da_prop_vm_100pct_06 > 1)
table(census_data$da_prop_vm_20pct_06 > 1)

## Check to see about proportions greater than 1
census_data %>%
  select(contains("prop")) %>%
  summary()

census_data %>%
  filter(da_prop_vm_100pct_06 > 1) %>%
  select(da_pop_06, da_pop_20pct_06, da_vm_pop_06, da_non_vm_pop_06)

## Just recode down to 1 for ease of use later.
census_data <- census_data %>% mutate(
  da_prop_vm_100pct_06 = ifelse(da_prop_vm_100pct_06 > 1, 1, da_prop_vm_100pct_06),
  da_prop_vm_20pct_06 = ifelse(da_prop_vm_20pct_06 > 1, 1, da_prop_vm_20pct_06)
)

## Calculate the herfindahl index

## survey.geo$obj.diversity.index.da <- survey.geo$DAOtherAsian^2 +
##   survey.geo$DAChinese^2 +
##   survey.geo$DABlack^2 +
##   survey.geo$DALatin^2 +
##   survey.geo$DASouthAsian^2

census_data <- census_data %>% mutate(da_diversity_index = da_prop_other_asian^2 +
  da_prop_chinese^2 +
  da_prop_black^2 +
  da_prop_latin^2 +
  da_prop_south_asian^2)

## all of the missing data is the same.
## a da either has valid info on all of them or none of them
table(is.na(census_data$da_black), is.na(census_data$da_chinese))
table(is.na(census_data$da_black), is.na(census_data$da_latin))
table(is.na(census_data$da_black), is.na(census_data$da_other_asian))
table(is.na(census_data$da_black), is.na(census_data$da_south_asian))

summary(census_data$da_diversity_index)
quantile(census_data$da_diversity_index, seq(0, 1, .1), na.rm = TRUE)

## A lot of places have no diversity
table(census_data$da_diversity_index == 0)

## Look at vm
table(census_data$da_prop_vm_100pct_06 == 0)

save(census_data, file = here("Data/CensusData/2006_Data/", "census_data_06.rda"))

## Now get CSD level data

census_data_csd_06 <- get_census(
  dataset = "CA06",
  regions = list(C = "Canada"),
  level = "CSD",
  vectors = c("v_CA06_1", "v_CA06_1302", "v_CA06_1303", "v_CA06_1314", "v_CA06_1315", "v_CA06_1316"),
  geo_format = "sf",
  resolution = "high"
)

census_data_csd_06 <- census_data_csd_06 %>% rename(
  "csd_area_kmsq_06" = "Area (sq km)",
  "csd_pop_06" = "v_CA06_1: Population, 2006 - 100% data",
  "csd_pop_20pct_06" = "v_CA06_1302: Total population by visible minority groups - 20% sample data",
  "csd_vm_pop_06" = "v_CA06_1303: Total visible minority population",
  "csd_vm_pop_nie_06" = "v_CA06_1314: Visible minority, n.i.e.", ## n.i.e. means not included elsewhere
  "csd_vm_pop_mult_06" = "v_CA06_1315: Multiple visible minority",
  "csd_non_vm_pop_06" = "v_CA06_1316: Not a visible minority"
)

summary(census_data_csd_06)

census_data_csd_06 <- census_data_csd_06 %>% mutate(
  csd_prop_vm_100pct_06 = csd_vm_pop_06 / csd_pop_06,
  csd_prop_vm_20pct_06 = csd_vm_pop_06 / csd_pop_20pct_06
)
summary(census_data_csd_06$csd_prop_vm_20pct_06)
summary(census_data_csd_06$csd_prop_vm_100pct_06)

## Note that some are missing. I don't know what this means when it comes to our data. We will discover later

## Code urbana versus rural csds from
## https://www12.statcan.gc.ca/census-recensement/2006/ref/dict/geo049-eng.cfm
##
## Urban area (UA) Modified on September 5, 2007
##
## Part A - Plain language definition: Area with a population of at least 1,000
## and no fewer than 400 persons per square kilometre.
##
## Part B - Detailed definition:
##
## An urban area has a minimum population concentration of 1,000 persons and a
## population density of at least 400 persons per square kilometre, based on
## the current census population count. All territory outside urban areas is
## classified as rural. Taken together, urban and rural areas cover all of
## Canada.
##
## Urban population includes all population living in the urban cores,
## secondary urban cores and urban fringes of census metropolitan areas (CMAs)
## and census agglomerations (CAs)>, as well as the population living in urban
## areas outside CMAs and CAs.

census_data_csd_06 <- census_data_csd_06 %>% mutate(
  csduid_census_06 = GeoUID,
  csd_pop_dens_06 = csd_pop_06 / csd_area_kmsq_06,
  csd_urban_06 = ifelse(csd_pop_06 >= 1000 & csd_pop_dens_06 >= 400, 1, 0)
)

save(census_data_csd_06, file = here("Data/CensusData/2006_Data/", "census_data_csd_06.rda"))

## Check to see that this looks reasonable
## library(ggplot2)
## ## Some issues finding proj.db for projections
## ## this next is not perfectly portable
## Sys.setenv(PROJ_LIB = "/opt/homebrew/Cellar/proj/9.4.0/share/proj")
## Sys.getenv("PROJ_LIB")
## ## [1] "/opt/homebrew/Cellar/proj/9.4.0/share/proj"
## ggplot() +
##   geom_sf(data = census_data_csd_06) +
##   theme_minimal()

system("touch census_data_2006.done")
